add_month_sequence_number <- function(dat) {
  # used in 1_get_data.R
  # Order month-years and create a numeric sequence
  m <- month(dat$date)
  y <- year(dat$date)
  month_dat <- unique(ifelse(m < 10, paste0(y, '_0', m), paste0(y, '_', m)))
  month_dat <- data.frame('month' = month_dat[order(month_dat)])
  rownames(month_dat) <- NULL
  month_dat$month_num <- 1:nrow(month_dat)
  
  # Merge original data with the new dataset that has sequential month-year numbers
  dat$month <- ifelse(m < 10, paste0(y, '_0', m), paste0(y, '_', m))
  dat <- merge(x = dat, y = month_dat, by = 'month')
  dat <- data.table(dat)
  dat$date <- as.Date(dat$date)
  dat$month <- factor(dat$month)
  dat$month_num <- factor(dat$month_num)
  return(dat)
}

add_lags_leads_to_protest <- function(protest_dates, shift_range = c(-7, 7) ) {
  # Summary: add dummies for lags and leads to protest data
  # Input:  protest_dates = vector of protest dates
  # Output: data.frame protest_data (data with dummies for lags and leads)
  min_lag <- 1
  max_lag <- abs(shift_range[1])
  min_lead <- 1
  max_lead <- shift_range[2]
  
  add_lags_lead <- function(data, shift) {
    # Summary: Add new dates that correspond to lags and leads to a data.frame
    data <- as.data.frame(data)
    new_dates <- data$date[data$shift == 0] + shift
    new_dates <- new_dates[!new_dates %in% data$date] # remove duplicates
    new_dates <- data.frame('date' = new_dates)
    new_dates['old_date'] <- new_dates$date - shift
    new_dates <- merge(x = new_dates, y = data, by.x = 'old_date', by.y = 'date')
    new_dates$shift <- shift
    new_dates <- subset(new_dates, select = -old_date)
    data <- rbind(data, new_dates)
    data <- data[order(data$date),]
    rownames(data) <- NULL
    return(data)
  }
  add_dummies_for_shift <- function(data) {
    # Summary: see function name
    dum <- data[,c('date', 'shift')]
    dum <- fastDummies::dummy_cols(.data = dum, select_columns = 'shift')
    names(dum) <- gsub(pattern = 'shift_-', replacement = 'protest_lag', x = names(dum))
    names(dum) <- gsub(pattern = 'shift_0', replacement = 'protest', x = names(dum))
    names(dum) <- gsub(pattern = 'shift_', replacement = 'protest_lead', x = names(dum))
    dum <- subset(dum, select = -shift)
    data <- merge(x = data, y = dum, by = 'date')
    if ('shift' %in% names(data)) {
      data <- subset(data, select = -shift)
    }
    return(data)
  }
  
  protest_data <- data.frame('date' = as.Date(protest_dates), 'shift' = 0, stringsAsFactors = F)
  
  shifts <- seq(from = shift_range[1], to = shift_range[2], by = 1)
  shifts <- shifts[shifts != 0]
  for (s in shifts) {
    protest_data <- add_lags_lead(data = protest_data, shift = s)
  }
  protest_data <- add_dummies_for_shift(protest_data)
  
  return(protest_data)
}

add_protest_data <- function(tw_data, protest_data ) {
  # Summary: merge protest data with lags and leads with tw_data
  # Input:  tw_data; protest_data from add_lags_leads_to_protest()
  # Output: tw_data merged with protest_data
  tw_data <- as.data.frame(merge(x = tw_data, y = protest_data, by = 'date', all.x = T))
  # replace NAs created after merging with 0s
  names_for_looping <- names(tw_data)[grepl(pattern = 'protest', x = names(tw_data))]
  for (col in names_for_looping) {
    tw_data[is.na(tw_data[,col]),col] <- 0 # replace NAs with 0s
  }
  tw_data <- data.table(tw_data)
  
  return(tw_data)
}
 
add_cumulative_lags_leads <- function(tw_data, shift_range = c(-7, 7))  {
  # Summary: Creates cumulative lags and leads for protest dates
  # Input: tw_data with protests from add_protest_data()
  # Output: tw_data with cumulative lags and leads
  min_lag <- 1
  max_lag <- abs(shift_range[1])
  min_lead <- 1
  max_lead <- shift_range[2]
  # Add new variables for cumulative lags/leads
  for (r in shift_range[1]:shift_range[2]) {
    if (r < 0) {
      tw_data[[paste0('protest_lag', abs(r), '_cumul')]] <- tw_data[[paste0('protest_lag', abs(r))]]
    }
    if (r > 0) {
      tw_data[[paste0('protest_lead', r, '_cumul')]] <- tw_data[[paste0('protest_lead', r)]]
    }
  }
  # Fill up the new cumulative lags variables
  tw_data$protest_lag1_cumul[tw_data$protest == 1] <- 1 # for cumul lags, add 1s to protest days themselves
    for (r in 2:max_lag) { # Insert 1s for all dates where previous cumul lags equal 1
    tw_data[[paste0('protest_lag', r, '_cumul')]][tw_data[[paste0('protest_lag', r-1, '_cumul')]] == 1] <- 1
  }
  
  # Fill up the new cumulative leads variables
  tw_data$protest_lead1_cumul[tw_data$protest == 1] <- 1 # for cumul leads, add 1s to protest days themselves
  for (r in 2:max_lead) { # Insert 1s for all dates where previous cumul leads equal 1
    tw_data[[paste0('protest_lead', r, '_cumul')]][tw_data[[paste0('protest_lead', r-1, '_cumul')]] == 1] <- 1
  }
  
  leads_to_remove <- names(tw_data)[grepl(pattern = 'protest_lead', x = names(tw_data)) & !grepl(pattern = 'cumul', x = names(tw_data) )]
  lags_to_remove <- names(tw_data)[grepl(pattern = 'protest_lag', x = names(tw_data)) & !grepl(pattern = 'cumul', x = names(tw_data) )]
  
  tw_data <- within(tw_data, rm(list = leads_to_remove))
  tw_data <- within(tw_data, rm(list = lags_to_remove))

  return(tw_data)
}

get_moving_median <- function(data, variable, window_size = 14) {
  # Summary: compute the moving median of the total number of tweets posted by opposition activists
  # Input: number of tweets for each opposition activist from a sample
  # Output: data.frame with the moving median (movmed)
  data <- as.data.frame(data)
  movmed <- sapply(0:(nrow(data)-window_size), function(k) median( data[(1+k):(window_size+k),][[variable]] ) )
  # movmed has fewer elements than the original time series. For proper comparison, either truncate TS, or pad it
  # We choose padding: use the same moving median value as was found for the 1st and the last available TS point with the rest of the front and back tails of TS
  padsize = floor( (nrow(data) - length(movmed) )/2 ) # padding is symmetrical, so divide the difference in the TS size and the moving median vector size by 2
  repl <- c(rep(movmed[1], padsize), movmed, rep(movmed[length(movmed)], padsize) ) # pad the moving median vector
  # Add moving median values to the dataset
  if ( nrow(data) == length(repl) ) {
    data[paste0(variable, '_movmed')] <- repl
  } else {
    data[paste0(variable, '_movmed')] <- c(repl, repl[length(repl)])
  }
  return(data)
}

make_data_reg <- function(data, max_lag = 7, max_lead = 7, placebo_dates = NA, opposition_data = NA, sentiment = FALSE) {
  # Summary: Set up data for regression analysis
  # Input: data_merged
  # Output: reg_data, reg_data_opp_sets
  curst <- proc.time()
  ### Compute dependent variables for each bot-day observation
  if (!sentiment) {
    dat0 <- data[, .('ntw' = .N, 
                     'put_sum' = sum(grepl(pattern = '((П|п)утин)', x = lemma)), 
                     'rtd' = length(unique(rt_source)), 
                     'nav_sum' = sum(grepl(pattern = '(Н|н)аваль', x = lemma)), 
                     'month' = unique(month_num), 
                     'protest' = unique(protest) ), by = .(userid, date)]
  } else {
    dat0 <- data[, .('ntw' = .N, 
                     'put_sum' = sum(grepl(pattern = '((П|п)утин)', x = lemma)), 
                     'rtd' = length(unique(rt_source)), 
                     'nav_sum' = sum(grepl(pattern = '(Н|н)аваль', x = lemma)), 
                     'month' = unique(month_num), 
                     'protest' = sum(protest)/.N,
                     'put_sum_posrusentilex' = sum(grepl(pattern = '(П|п)утин', x = lemma) & (rusentilex_pos > rusentilex_neg)),
                     'put_sum_negrusentilex' = sum(grepl(pattern = '(П|п)утин', x = lemma) & (rusentilex_pos < rusentilex_neg)),
                     'nav_sum_posrusentilex' = sum(grepl(pattern = '(Н|н)аваль', x = lemma) & (rusentilex_pos > rusentilex_neg)),
                     'nav_sum_negrusentilex' = sum(grepl(pattern = '(Н|н)аваль', x = lemma) & (rusentilex_pos < rusentilex_neg))
    ), by = .(userid, date)]
  }
  cat('Done: step 1\n')
  
  ### Merge dat0 with data on lags and leads
  data1 <- as.data.frame(data)
  # Lags
  lag_names <- names(data1)[grepl(pattern = '(date)|(lag)', x = names(data1))]
  lag_dat <- data1[, lag_names]
  lag_dat <- lag_dat[!duplicated(lag_dat),]
  # Leads
  lead_names <- names(data1)[grepl(pattern = '(date)|(lead)', x = names(data1))]
  lead_dat <- data1[, lead_names]
  lead_dat <- lead_dat[!duplicated(lead_dat),]
  # Merge
  dat0 <- merge(x = dat0, y = lag_dat, by = 'date')
  dat0 <- merge(x = dat0, y = lead_dat, by = 'date')
  
  ### Add a dummy for peak days to dat0
  dat0$peak5 <- ifelse(dat0$date %in% subset(opposition_data, peak5 == 1)$date, 1, 0)
  cat('Found', length(unique(dat0$date[dat0$peak5 == 1])), 'dates with peaks\n')
  cat('Done: step 2\n')
  
  ### Make lags 1-7 for online peaks
  mypeaks <- as.Date(peak_data$date[peak_data$peak5 == 1])
  peak_lags_list <- lapply(1:max_lag, function(k) mypeaks - k )
  peak_leads_list <- lapply(1:max_lead, function(k) mypeaks + k )
  # Make a data.frame w/ dummies for peak lags 1-7
  all_lag_dates <- data.frame('date' = unique(dat0$date))
  all_lag_dates$peak_lag0_cumul <- ifelse(all_lag_dates$date %in% mypeaks, 1, 0)
  for (i in 1:max_lag) {
    all_lag_dates[,i+2] <- ifelse(all_lag_dates$date %in% peak_lags_list[[i]], 1, 0)
  }
  names(all_lag_dates)[3:(max_lag+2)] <- paste0('peak_lag', 1:max_lag, '_cumul')
  
  for (r in 1:max_lag) { # Insert 1s for all dates where previous cumul lags equal 1: if on dayX peak_lag0=1, it should be 1 for peak_lag1 too, as peak_lag1 includes peak_lag0
    all_lag_dates[[paste0('peak_lag', r, '_cumul')]][all_lag_dates[[paste0('peak_lag', r-1, '_cumul')]] == 1] <- 1
  }
  
  all_lag_dates <- subset(all_lag_dates, select = -peak_lag0_cumul)
  
  cat('Done: step 3\n')
  
  ### Make leads 1-7 for online peaks
  peak_leads_list <- lapply(1:max_lead, function(k) mypeaks + k )
  
  all_lead_dates <- data.frame('date' = unique(dat0$date))
  all_lead_dates$peak_lead0_cumul <- ifelse(all_lead_dates$date %in% mypeaks, 1 ,0)
  for (i in 1:max_lead) {
    all_lead_dates[,i+2] <- ifelse(all_lead_dates$date %in% peak_leads_list[[i]], 1 ,0)
  }
  names(all_lead_dates)[3:(max_lead+2)] <- paste0('peak_lead', 1:max_lead, '_cumul')
  
  for (r in 1:max_lead) { # Insert 1s for all dates where previous cumul leads equal 1
    all_lead_dates[[paste0('peak_lead', r, '_cumul')]][all_lead_dates[[paste0('peak_lead', r-1, '_cumul')]] == 1] <- 1
  }
  all_lead_dates <- subset(all_lead_dates, select = -peak_lead0_cumul)
  
  ### Merge peak lags and leads with the rest of data
  dat0 <- merge(x = dat0, y = all_lag_dates, by = 'date')
  dat0 <- merge(x = dat0, y = all_lead_dates, by = 'date')
  cat('Done: step 4\n')
  
  ### Make dummies for placebo days [placebo_dates]
  if ( is.list(placebo_dates) ) {
    for (j in 1:length(placebo_dates)) {
      dat0[,paste0('placebo', j)] <- ifelse(dat0$date %in% placebo_dates[[j]], 1, 0)
    }
  } else {
    dat0[,'placebo'] <- ifelse(dat0$date %in% placebo_dates, 1, 0)
  }
  curfin <- proc.time()
  cat('Total time:', (curfin[3] - curst[3]) / 60, ' min\n')
  return(dat0)
}

setup_data_sapd <- function(data) {
  dat0 <- as.data.frame( data)
  ##### Relabel bot accounts for Stan ( otherwise, array dimensions wouldn't match )
  acc_dat <- data.frame(userid = unique(dat0$userid))
  acc_dat$id_num <- 1:nrow(acc_dat)
  dat0 <- merge(x = dat0, y = acc_dat, by = 'userid')
  dat0$protest_var <- factor(dat0$protest)
  dat0$peak_var <- factor(dat0$peak5)
  dat0$placebo_var <- factor(dat0$placebo)
  return(dat0)
}

setup_data_analysis <- function(data, curprotest = 'protest', curpeak = 'peak5', curplacebo = 'placebo') {
  # Summary: setup data.frame for regression analysis
  # Input: data.frame from make_data_reg()
  # Output: updated data.frame with renamed explanatory variables and added sequential userids
  dat0 <- as.data.frame( data)
  # Make sequential numbers for userids
  acc_dat <- data.frame(userid = unique(dat0$userid))
  acc_dat$id_num <- 1:nrow(acc_dat)
  # Merge original data with sequential userids numbers
  dat0 <- merge(x = dat0, y = acc_dat, by = 'userid')
  # Rename key explanatory variables for consistency across models
  dat0$protest_var <- factor(dat0[[curprotest]])
  dat0$peak_var <- factor(dat0[[curpeak]])
  dat0$placebo_var <- factor(dat0[[curplacebo]])
  return(dat0)
}

make_balanced_data_for_sapd <- function(model, data) {
  ###Previous name: make_maref_data_balanced
  # Set up a balanced dataset for simulated data
  sim_data <- expand.grid('id_num' = min(as.numeric(data$id_num)):max(as.numeric(data$id_num)), 
                          'month' = min(as.numeric(data$month)):max(as.numeric(data$month)))
  
  # Extract user-specific random effects 
  tau <- ranef(model)$id_num
  if (ncol(tau) == 1) {
    # id_const is a user-specific random effect for the intercept
    colnames(tau) <- 'id_const'
  } else {
    # id_protest etc are user-specific random effects for respective slopes
    colnames(tau) <- c('id_const', 'id_protest', 'id_peak', 'id_placebo') 
  }
  tau$id_num <- 1:nrow(tau)
  
  # Add user-specific random effects to sim_data
  sim_data <- merge(x = sim_data, y = tau, by = 'id_num')
  
  # Compute and add month-specific random effects if relevant
  if ('month' %in% names(ranef(model))) {
    # Extract month-specific random effects 
    gamma <- ranef(model)$month
    colnames(gamma) <- 'month_re'
    gamma$month <- 1:nrow(gamma)
    # Add month-specific random effects to sim_data
    sim_data <- merge(x = sim_data, y = gamma, by = 'month')
  }
  
  # Add fixed effects part
  if (!"(Intercept)" %in% names(fixef(model))) {
    cat('WARNING: NO (Intercept) in fixef\n')
  }
  if (!"protest_var1" %in% names(fixef(model))) {
    cat('WARNING: NO protest_var1 in fixef\n')
  }
  if (!"peak_var1" %in% names(fixef(model))) {
    cat('WARNING: NO peak_var1 in fixef\n')
  }
  if (!"placebo_var1" %in% names(fixef(model))) {
    cat('WARNING: NO placebo_var1 in fixef\n')
  }
  sim_data$const <- fixef(model)["(Intercept)"]
  sim_data$const_protest <- fixef(model)["protest_var1"]
  sim_data$const_peak <- fixef(model)["peak_var1"]
  sim_data$const_placebo <- fixef(model)["placebo_var1"]
  sim_data$treat_var <- factor(x = 1, levels = c(0,1))
  return(sim_data)
}

compute_sapd <- function(models, reg_datasets, BONFE_NUM = length(models)) {
  # Input: vector of model names to consider 
  # Output: list with SAPD point and interval estimates and datplot data.frame for further plotting 
  st <- proc.time()
  sapd_results <- list()
  
  for (mo in names(models)) {
    sapd_results[[mo]] <- list()
    cat('Current model name:', mo, '\n')
    
    # Select the appropriate dataset and model
    if (mo %in% c("ntw", "rtd", "put", "nav")) {
      current_data <- reg_datasets[['main']]
      current_model <- models[[mo]]
    } else {
      # For Appendix L
      if (grepl(pattern = '_sent_', x = mo)) {
        current_data <- reg_datasets[['sent']]
        current_model <- models[[mo]]
      } else {
        # For Appendix K
        if (grepl(pattern = 'extended', x = mo)) {
          current_data <- reg_datasets[['extended']]
          current_model <- models[[mo]]
        } else {
          # For Appendix K (Subsets of activists)
          if (grepl(pattern = 'subset', x = mo)) {
            subset_num <- gsub(pattern = '(ntw)|(rtd)|(put)|(nav)|(_subset)', x = mo, replacement = '')
            current_data <- reg_datasets[[paste0('subset', subset_num)]]
            current_model <- models[[mo]]
          } else {
            if (grepl(pattern = 'app_m', x = mo)) {
              current_data <- reg_datasets[['main']]
              current_model <- models[[mo]]
            } else {
              # For main (lags and leads)
              dat_name <- gsub(pattern = '(_ntw)|(_rtd)|(_put)|(_nav)', replacement = '', x = mo)
              if (grepl(pattern = 'lag', x = dat_name)) {
                current_data <- reg_datasets[[dat_name]]
                current_model <- models[[mo]]
                }
              if (grepl(pattern = 'lead', x = dat_name)) {
                current_data <- reg_datasets[[dat_name]]
                current_model <- models[[mo]]
              }
            }
          }
        }
      }
    }
    
    simd <- make_balanced_data_for_sapd(model = current_model, data = current_data) # make balanced data for computing SAPD
    simb <- heavy::rmnorm(n = 1000, mean = fixef(current_model), Sigma = vcov(current_model)) # simulate coefficients
    cat('step 1 done\n')
    
    # Compute SAPD (point estimate and CI) for protests
    sapd_results[[mo]]$protest_pointest <- with(simd, mean(exp(const + id_const + const_protest + id_protest + month_re) - exp(const + id_const + month_re) ) )
    sapd_results[[mo]]$protest_simuls <- sapply(1:nrow(simb), function(k) with(mutate(simd, const = simb[k,1], const_protest = simb[k,2]), mean(exp(const + id_const + const_protest + id_protest + month_re) - exp(const + id_const + month_re) ) ) )
    sapd_results[[mo]]$protest_ci <- coda::HPDinterval(coda::as.mcmc(sapd_results[[mo]]$protest_simuls), prob = 1 - 0.05/BONFE_NUM)
    cat('step 2 done\n')
    
    # Compute SAPD (point estimate and CI) for peaks
    sapd_results[[mo]]$peak_pointest <- with(simd, mean(exp(const + id_const + const_peak + id_peak + month_re) - exp(const + id_const + month_re) ) )
    sapd_results[[mo]]$peak_simuls <- sapply(1:nrow(simb), function(k) with(mutate(simd, const = simb[k,1], const_peak = simb[k,3]), mean(exp(const + id_const + const_peak + id_peak + month_re) - exp(const + id_const + month_re) ) ) )
    sapd_results[[mo]]$peak_ci <- coda::HPDinterval(coda::as.mcmc(sapd_results[[mo]]$peak_simuls), prob = 1 - 0.05/BONFE_NUM)
    cat('step 3 done\n')
    
    # Compute SAPD (point estimate and CI) for placebo
    sapd_results[[mo]]$placebo_pointest <- with(simd, mean(exp(const + id_const + const_placebo + id_placebo + month_re) - exp(const + id_const + month_re) ) )
    sapd_results[[mo]]$placebo_simuls <- sapply(1:nrow(simb), function(k) with(mutate(simd, const = simb[k,1], const_placebo = simb[k,4]), mean(exp(const + id_const + const_placebo + id_placebo + month_re) - exp(const + id_const + month_re) ) ) )
    sapd_results[[mo]]$placebo_ci <- coda::HPDinterval(coda::as.mcmc(sapd_results[[mo]]$placebo_simuls), prob = 1 - 0.05/BONFE_NUM)
    cat('step 4 done\n')
    
    # Store results
    sapd_results[[mo]]$datplot <- data.frame('sapd' = c(sapd_results[[mo]]$protest_pointest, sapd_results[[mo]]$peak_pointest, sapd_results[[mo]]$placebo_pointest),
                                             'lb' = c(sapd_results[[mo]]$protest_ci[1,1], sapd_results[[mo]]$peak_ci[1,1], sapd_results[[mo]]$placebo_ci[1,1]),
                                             'ub' = c(sapd_results[[mo]]$protest_ci[1,2], sapd_results[[mo]]$peak_ci[1,2], sapd_results[[mo]]$placebo_ci[1,2]),
                                             'dv' = mo,
                                             'event' = c('protest days', 'online spikes', '10 placebo days') )
    cat('step 5 done\n')
  }
  fin <- proc.time()
  cat((fin[3] - st[3])/60, ' mins\n') # 33 min
  
  return(sapd_results)
}

make_data_for_plot <- function(dat, block, shift = 3, lags = T, leads = T) {
  # Summary: sets up data for making Figures 2-5 in main analysis
  # Input: datplot
  # Output: data.frame for ggplot
  d <- subset(dat, grepl(pattern = block, x = dv))
  
  main <- subset(d, dv == block)
  
  if (lags) {
    peak_lag <- subset(d, grepl(pattern = 'lag', x = dv) & grepl(pattern = 'peak', x = dv) & event == 'online spikes')
    # clean the event variable: it should become numeric and equal to the value of the shift 
    peak_lag$event <- as.numeric(gsub(pattern = paste0('(peak_lag)|(_', block, ')'), replacement = '', x = peak_lag$dv))
    # select lags <= required shift
    peak_lag <- subset(peak_lag, event <= shift)
    peak_lag$event <- paste0('spikes: 0-', peak_lag$event, ' days before')
    
    protest_lag <- subset(d, grepl(pattern = 'lag', x = dv) & grepl(pattern = 'protest', x = dv) & event == 'protest days')
    # clean the event variable: it should become numeric and equal to the value of the shift 
    protest_lag$event <- as.numeric(gsub(pattern = paste0('(protest_lag)|(_', block, ')'), replacement = '', x = protest_lag$dv))
    # select lags <= required shift
    protest_lag <- subset(protest_lag, event <= shift)
    protest_lag$event <- paste0('protest: 0-', protest_lag$event, ' days before')
    
    main <- rbind(main, peak_lag, protest_lag)
  }
  
  if (leads) {
    # do the same pre-processing as for lags
    peak_lead <- subset(d, grepl(pattern = 'lead', x = dv) & grepl(pattern = 'peak', x = dv) & event == 'online spikes')
    peak_lead$event <- as.numeric(gsub(pattern = paste0('(peak_lead)|(_', block, ')'), replacement = '', x = peak_lead$dv))
    peak_lead <- subset(peak_lead, event <= shift)
    peak_lead$event <- paste0('spikes: 0-', peak_lead$event, ' days after')
    
    protest_lead <- subset(d, grepl(pattern = 'lead', x = dv) & grepl(pattern = 'protest', x = dv) & event == 'protest days')
    protest_lead$event <- as.numeric(gsub(pattern = paste0('(protest_lead)|(_', block, ')'), replacement = '', x = protest_lead$dv))
    protest_lead <- subset(protest_lead, event <= shift)
    protest_lead$event <- paste0('protest: 0-', protest_lead$event, ' days after')
    
    main <- rbind(main, peak_lead, protest_lead)
  }
  
  d <- main
  # New: to increase the gap between spikes and protest
  newd <- data.frame('sapd' = c(NA, NA), 'lb' = c(NA, NA), 'ub' = c(NA, NA), 'dv' = c(NA, NA), 'event' = c('holder_spikes', 'holder_protest'))
  d <- rbind(d, newd)
  # End new
  d$event <- factor(d$event, levels = c('10 placebo days', 'holder_protest',
                                        paste0('protest: 0-', seq(from=shift, to=1, by=-1),' days after'),
                                        'protest days',
                                        paste0('protest: 0-', seq(from=1, to=shift, by=1),' days before'),
                                        
                                        'holder_spikes',
                                        paste0('spikes: 0-', seq(from=shift, to=1, by=-1),' days after'),
                                        'online spikes',
                                        paste0('spikes: 0-', seq(from=1, to=shift, by=1),' days before')
  ))
  d$event_num <- as.numeric(d$event)
  d <- d[order(d$event_num),]
  rownames(d) <- NULL
  d <- subset(d, select = -event_num)
  
  return(d)
}

make_quick_plot <- function(data, title = NULL, xaxis_label = NULL, xmax = 2) {
  # Summary: Makes plots for main analysis (Fig 2-5) and App.F (Fig. 8-15)
  # Input: Pre-processed (see make_data_for_plot() ) datplot
  # Output: ggplot
  pp <- ggplot(data, aes(x = sapd,
                         y = event,
                         xmin = lb,
                         xmax = ub)) +
    geom_segment(data = data, aes(x = lb,
                                  y = event,
                                  xend = ub,
                                  yend = event), size = 1, color = ifelse(data$event %in% c('10 placebo days', 'protest days', 'online spikes'), 'black', 'gray40') ) +
    geom_point(size = 4, col = ifelse(data$event %in% c('10 placebo days', 'protest days', 'online spikes'), 'black', ifelse(grepl(pattern = 'holder', x = data$event), 'white', 'gray40'))) +
    theme_bw()  +
    theme(axis.text.y = element_text(color = ifelse(data$event %in% c('10 placebo days', 'protest days', 'online spikes'), 'black', ifelse(grepl(pattern = 'holder', x = data$event), 'white', 'gray40')), size = 15),
          axis.text.x = element_text(color = 'black', size = 15),
          axis.title = element_text(color = 'black', size = 15),
          axis.ticks.y = element_blank(),
          plot.title = element_text(hjust = 0.5, size = 15),
          plot.margin = margin(0, 0.5, 0.5, 0, "cm") ) +
    geom_vline(xintercept = 0, lty = 2) +
    xlim(-0.5, xmax) +
    ylab('')
  if (!is.null(title)) {
    pp <- pp + ggtitle(title)
  }
  
  if (!is.null(xaxis_label)) {
    pp <- pp + xlab(xaxis_label)
  } else {
    pp <- pp + xlab('Simulated average predictive difference')
  }
  return(pp)
}

get_protest_dates <- function(main = T) {
  # If protest data is not in the namespace, load it
  if (!'protest' %in% ls()) {
    protest <- list()
    protest$raw_data <- read.xlsx(xlsxFile = 'data/protest_data3.xlsx')
  }
  if (main) {
    protest_dates <- as.character( subset(protest$raw_data, dataset == 'icews')$date )
  } else {
    protest_dates <- as.character( protest$raw_data$date )
  }
  
  protest_dates <- protest_dates[order(protest_dates, decreasing = F)]
  return(protest_dates)
}

get_peak_dates <- function() {
  # Summary: returns dates of online peaks (spikes in online opposition activity)
  if (! 'peak_data' %in% ls() ) {
    optw <- get(load('data/opposition.RData'))
    
    agg_op <- optw[, .('opsum' = .N), by = 'date']
    agg_op <- get_moving_median(data = agg_op, variable = 'opsum', window_size = 14)
    agg_op$date <- as.Date(agg_op$date)
    agg_op$peak5 <- ifelse(agg_op$opsum/agg_op$opsum_movmed >= 5, 1, 0)
    
    peak_data <- subset(agg_op, peak5 == 1, select = c(date, opsum, peak5))
  }
    peak_data <- peak_data[order(peak_data$date, decreasing = F),]
    rownames(peak_data) <- NULL
    peak_dates <- as.character(peak_data$date)
    return(peak_dates)
}

make_protest_numbers <- function(data, protest_dates) {
  # Summary: Add protest_num variable to data.frame
  protest_dates <- as.character(protest_dates)
  data$date <- as.character(data$date)
  data$protest_num <- 0
  for (i in 1:length(protest_dates)) {
    data$protest_num[data$date == protest_dates[i]] <- i
  }
  return(data)
} 

make_peak_numbers <- function(data, peak_dates) {
  # Summary: Add peak_num variable to data.frame
  peak_dates <- as.character(peak_dates)
  data$date <- as.character(data$date)
  data$peak_num <- 0
  for (i in 1:length(peak_dates)) {
    data$peak_num[data$date == peak_dates[i]] <- i
  }
  return(data)
} 

get_median_topic_prob_by_event <- function(topic_data, protest = T) {
  # Summary: computes the median probabilities for each topic across all tweets posted on a given day (protest or spike)
  probout <- list()
  n_topics = 35
  if (protest) {
    N_events = 27 # 27 is the number of protests
    for (i in 1:N_events) {
      d <- subset(topic_data, protest_num == i, select = -c(date, lemma, protest, protest_num))
      probout[[i]] <- round( apply(d, MARGIN=2, FUN = median), 3)
      Sys.sleep(0.5)
    }
  } else {
    N_events = 24 # 24 is the number of peaks
    for (i in 1:N_events) {
      d <- subset(topic_data, peak_num == i, select = -c(date, lemma, peak5, peak_num))
      probout[[i]] <- round( apply(d, MARGIN=2, FUN = median), 3)
      Sys.sleep(1)
    }
  }
  probout <- data.frame('value' = as.vector(unlist(probout)), 
                        'event' = factor(rep(1:N_events, each = n_topics)),
                        'topic' = factor(rep(1:n_topics, N_events)), stringsAsFactors = F)
  return(probout)
}

prepare_data_opp_sets <- function(data) {
  # Summary: Fixes variable names according to the paper
  # Input: datplot with SAPD
  # Output: list of datplots with SAPD and correct variable names
  
  # Extract subset no.
  data$subset_num <- as.numeric( gsub(pattern = '(ntw)|(rtd)|(put)|(nav)|(_subset)', replacement = '', x = data$dv) )
  # Make the event variable: spikes (subset X) for subsets or spikes (main) for the main results
  data$event_true <- ifelse(data$event == 'online spikes' & !is.na(data$subset_num), 
                            paste0('spikes (subset ', data$subset_num, ')'), 
                            ifelse(data$event == 'online spikes' & is.na(data$subset_num), 
                                   'spikes (main)', data$event ) )
  # Organize the output as a list with DV-specific entries
  output <- list()
  output$ntw <- subset(data, grepl(pattern = 'ntw', x = dv) & grepl(pattern = 'spikes', x = event_true), select = c(sapd, lb, ub, dv, event_true) )
  rownames(output$ntw) <- NULL
  output$ntw$event_true <- factor(x = output$ntw$event_true, levels = c(paste0('spikes (subset ', 10:1, ')'), 'spikes (main)'))
  
  output$rtd <- subset(data, grepl(pattern = 'rtd', x = dv) & grepl(pattern = 'spikes', x = event_true), select = c(sapd, lb, ub, dv, event_true) )
  rownames(output$rtd) <- NULL
  output$rtd$event_true <- factor(x = output$rtd$event_true, levels = c(paste0('spikes (subset ', 10:1, ')'), 'spikes (main)'))
  
  output$put <- subset(data, grepl(pattern = 'put', x = dv) & grepl(pattern = 'spikes', x = event_true), select = c(sapd, lb, ub, dv, event_true) )
  rownames(output$put) <- NULL
  output$put$event_true <- factor(x = output$put$event_true, levels = c(paste0('spikes (subset ', 10:1, ')'), 'spikes (main)'))
  
  output$nav <- subset(data, grepl(pattern = 'nav', x = dv) & grepl(pattern = 'spikes', x = event_true), select = c(sapd, lb, ub, dv, event_true) )
  rownames(output$nav) <- NULL
  output$nav$event_true <- factor(x = output$nav$event_true, levels = c(paste0('spikes (subset ', 10:1, ')'), 'spikes (main)'))
  
  return(output)
}

make_quick_plot_opp <- function(data, title = NULL, xaxis_label = NULL, xmax = 1) {
  # Summary: Makes plots for subsets of opposition leaders (Fig 24-27)
  # Input: Pre-process (see prepare_data_opp_sets() ) datplot
  # Output: ggplot
  text_colors <- c(rep('gray40', 10), 'black')
  
  pp <- ggplot(data, aes(x = sapd,
                         y = event_true,
                         xmin = lb,
                         xmax = ub)) +
    geom_segment(data = data, aes(x = lb,
                                  y = event_true,
                                  xend = ub,
                                  yend = event_true), size = 1, color = ifelse(data$event_true %in% c('spikes (main)'), 'black', 'gray40') ) +
    geom_point(size = 4, col = ifelse(data$event_true %in% c('spikes (main)'), 'black', 'gray40')) +
    theme_bw()  +
    theme(axis.text.y = element_text(color = text_colors, size = 15),
          axis.text.x = element_text(color = 'black', size = 15),
          axis.title = element_text(color = 'black', size = 15),
          plot.title = element_text(hjust = 0.5, size = 15)) +
    geom_vline(xintercept = 0, lty = 2) +
    xlim(-0.5, xmax) +
    ylab('')
  if (!is.null(title)) {
    pp <- pp + ggtitle(title)
  }
  
  if (!is.null(xaxis_label)) {
    pp <- pp + xlab(xaxis_label)
  } else {
    pp <- pp + xlab('Simulated average predictive difference')
  }
  return(pp)
}

make_data_for_sent <- function(data, block) {
  # Summary: pre-process data for making Fig 28 and 29
  # Input: datplot_sent
  # Output: data.frame for ggplot
  
  data <- subset(data, grepl(pattern = block, x = data$dv))
  
  data$event_true <- ifelse(!grepl(pattern = 'sent', x = data$dv), paste0(data$event, ' (main)'),
                            ifelse(grepl(pattern = 'dif', x = data$dv), paste0(data$event, ' (sent. dif.)'), paste0(data$event, ' (sent.)')) )
  
  data$event_true <- factor(x = data$event_true, levels = paste0(rep(x = c('10 placebo days', 'online spikes', 'protest days'), each = 3), 
                                                                 rep(x = c(' (main)', ' (sent.)', ' (sent. dif.)'), 3) ) )
  
  data <- subset(data, select = c(sapd, lb, ub, event_true))
  rownames(data) <- NULL
  
  return(data)
}

make_quick_plot_sent <- function(data, title = NULL, xaxis_label = NULL, xmax = 1) {
  # Summary: use ggplot to make Fig 28 and 29
  # Input: pre-processed data from make_data_for_sent()
  # Output: ggplot
  text_colors <- rep(x = c('black', 'gray40', 'gray40'), 3)
  
  pp <- ggplot(data, aes(x = sapd,
                         y = event_true,
                         xmin = lb,
                         xmax = ub)) +
    geom_segment(data = data, aes(x = lb,
                                  y = event_true,
                                  xend = ub,
                                  yend = event_true), size = 1, color = ifelse(grepl(pattern = 'main', x = data$event_true), 'black', 'gray40') ) +
    geom_point(size = 4, col = ifelse(grepl(pattern = 'main', x = data$event_true), 'black', 'gray40') ) +
    theme_bw()  +
    theme(axis.text.y = element_text(color = text_colors, size = 15),
          axis.text.x = element_text(color = 'black', size = 15),
          axis.title = element_text(color = 'black', size = 15),
          plot.title = element_text(hjust = 0.5, size = 15)) +
    geom_vline(xintercept = 0, lty = 2) +
    xlim(-0.5, xmax) +
    ylab('')
  if (!is.null(title)) {
    pp <- pp + ggtitle(title)
  }
  
  if (!is.null(xaxis_label)) {
    pp <- pp + xlab(xaxis_label)
  } else {
    pp <- pp + xlab('Simulated average predictive difference')
  }
  return(pp)
}

make_data_for_plot_extended <- function(data, block) {
  # Summary: pre-process data for making Fig 20-  23
  # Input: datplot_extended
  # Output: data.frame for ggplot
  data <- subset(data, grepl(pattern = block, x = dv))
  data$event <- ifelse(grepl(pattern = 'extended', x = data$dv), paste0(data$event, ' (aug.)'), paste0(data$event, ' (main)'))
  data$event <- factor(x = data$event, levels = paste0( rep(c('10 placebo days', 'online spikes', 'protest days'), each = 2), rep(c(' (main)', ' (aug.)'), 2)) )
  data <- subset(data, select = -dv)
  
  return(data)
}

make_quick_plot_extended <- function(data, title = NULL, xaxis_label = NULL, xmax = 1) {
  # Summary: use ggplot to make Fig 20 - 23
  # Input: pre-processed data from make_data_for_plot_extended()
  # Output: ggplot
  
  mycols <- rep(x = c('gray40', 'black'), times = 3)
  
  pp <- ggplot(data, aes(x = sapd,
                         y = event,
                         xmin = lb,
                         xmax = ub)) +
    geom_segment(data = data, aes(x = lb,
                                  y = event,
                                  xend = ub,
                                  yend = event), size = 1, color = ifelse(grepl(pattern = 'aug', x = data$event), 'black', 'gray40') ) +
    geom_point(size = 4, col = ifelse(grepl(pattern = 'aug', x = data$event), 'black', 'gray40')) +
    theme_bw()  +
    theme(axis.text.y = element_text(color = mycols, size = 15),
          axis.text.x = element_text(color = 'black', size = 15),
          axis.title = element_text(color = 'black', size = 15),
          plot.title = element_text(hjust = 0.5, size = 15)) +
    geom_vline(xintercept = 0, lty = 2) +
    xlim(-0.5, xmax) +
    ylab('')
  if (!is.null(title)) {
    pp <- pp + ggtitle(title)
  }
  
  if (!is.null(xaxis_label)) {
    pp <- pp + xlab(xaxis_label)
  } else {
    pp <- pp + xlab('Simulated average predictive difference')
  }
  return(pp)
}

make_data_for_plot_app_m <- function(data) {
  # Summary: Fixes variable names according to the paper
  # Input: datplot with SAPD
  # Output: datplot with SAPD and correct variable names
  datplot_appm <- subset(data, dv == 'rtd' | dv == 'model_app_m')
  datplot_appm$event <- c('protest days (main)', 'online spikes (main)', '10 placebo days (main)',
                          'protest days (control)', 'online spikes (control)', '10 placebo days (control)' )
  
  datplot_appm <- rbind(datplot_appm, data.frame('sapd' = c(NA,NA), 'lb' = c(NA,NA), 'ub' = c(NA,NA), 'dv' = c('holder1','holder2'), 'event'= c('holder1','holder2') ))
  
  datplot_appm$event <- factor(x = datplot_appm$event, levels = c('10 placebo days (main)', '10 placebo days (control)', 'holder2',
                                                                  'online spikes (main)', 'online spikes (control)', 'holder1',
                                                                  'protest days (main)', 'protest days (control)' ))
  return(datplot_appm)
}


make_quick_plot_app_m <- function(data) {
  # Summary: use ggplot to make Fig 30
  # Input: pre-processed data from make_data_for_plot_app_m()
  # Output: ggplot
  mycols = c('gray40', 'black', 'white', 'gray40', 'black', 'white', 'gray40', 'black')
  pp <- ggplot(data, aes(x = sapd,
                         y = event,
                         xmin = lb,
                         xmax = ub)) +
    geom_segment(data = data, aes(x = lb,
                                  y = event,
                                  xend = ub,
                                  yend = event), size = 1, color = ifelse(grepl(pattern = 'control', x = data$event), 'black', 'gray40') ) +
    geom_point(size = 4, col = ifelse(grepl(pattern = 'control', x = data$event), 'black', ifelse(grepl(pattern = 'holder', x = data$event), 'white', 'gray40'))) +
    theme_bw()  +
    theme(axis.text.y = element_text(color = mycols, size = 15),
          axis.text.x = element_text(color = 'black', size = 15),
          axis.title = element_text(color = 'black', size = 15),
          axis.ticks.y = element_blank(),
          plot.title = element_text(hjust = 0.5, size = 15),
          plot.margin = margin(0, 0.5, 0.5, 0, "cm") ) +
    geom_vline(xintercept = 0, lty = 2) +
    #xlim(-0.5, xmax) +
    ylab('') +
    xlab('Simulated average predictive difference')
  return(pp)
}
  
get_frex_topics <- function(model, nwords = 15) {
  # Summary: Put together top frex words and topic numbers
  a <- labelTopics(model, n = nwords)
  a <- data.frame(t(as.matrix(a$frex)), stringsAsFactors = F)
  names(a) <- paste0('topic_', 1:ncol(a))
  return(a)
}


